MacTech 1 to 12

home *** CD-ROM | disk | FTP | other *** search

/ MacTech 1 to 12 / MacTech-vol-1-12.toast / Source / MacTech® Magazine / Volume 11 - 1995 / 11.10 Oct 95 / Diff-Warrior.c < prev next >

Wrap

C/C++ Source or Header | 1995-08-14 | 24.8 KB | 862 lines | [TEXT/Rich]

/* 11 August 1995. Submission to MacTech Programmer's challenge August 1995. Copyright 1995, Ernst Munter, Kanata, ON, Canada. The problem and the solution: --------------------------------- Given two texts, compute a series of insert/delete/move instructions that will describe the conversion of one text into the other. I parse the old text, and create a sequential word list. The words are further linked into lists, accessed through a hash table. Then, I parse the second text, and for each word in the second text, I try to find a matching word in the first text. The hashed table of lists helps me find the first matching word quickly. I remove it from the list, and mark it for a potential move operation. Each matching word found in the first text may either remain in place, or must be moved. There exists an algorithm to determine the best set of words (most characters) to leave undisturbed, and so minimize the amount to be moved. However, I did not have the patience to work this out fully. So, as a compromise solution, I just go sequentially through the texts together, and make an educated guess for each block of contiguous matching words, to decide whether to leave or move them. Any words encountered in the new text that are not found in the old text are immediately marked for "insert". After the whole text is scanned, any words or blocks left behind in the hash lists, are destined for deletion. Before the parsing, a character by character comparison of the two texts is done from each end, to cut off all text which is equal and does not need any further analysis. This may result in the discovery that both texts are identical. All other special cases, such as only a single change at one end or in the middle of one of the texts, will be automatically handled. Before returning the DiffRecs to the caller, I analyze pairs of inserts and deletes to eliminate redundancies. The program will allocate a fair amount of memory to build the word lists in. This can be minimized by doing a word count first. But I prefer to just provide a conservative factor (4 chars per word, white space included) which should be enough for normal texts. Please use the "countWords" directive if memory must be conserved. I have allocated a fixed static hash table of 997 entries. With 65000 bytes, and say, 6.5 bytes per word really, each list would contain 10 entries on the average. hashMod can easily be increased to some larger prime number to speed up word lookup for large files. I have also provided a lookup table for determining what characters are considered parts of words, and which are white space. Include foreign letters and digits if that is desirable. If the texts are computer programs, underscore and other symbols might be included in "alpha", depending on the language. */ #include <stdio.h> #include <stdlib.h> #define ulong unsigned long #define ushort unsigned short typedef enum { deletedText=0, insertedText, movedText } DiffType; typedef struct { DiffType type; ulong rangeStart; ulong rangeEnd; ulong position; } DiffRec; short FindWordDifferences ( char *oldText, char *newText, ulong numOldChars, ulong numNewChars, DiffRec diffs[], ulong maxDiffRecs); #define countWords 0 #if countWords /*use the actual word count to reserve Snippet space*/ #else /*use a very conservative estimate to reserve enough*/ #define averageWordSize 4 #endif #define hashMod 997 //any reasonable prime number #define inDelete 0 //state values during text scan #define inMove 1 #define inInsert 2 #define inLimbo 3 #define nullRec 3 //used to mark redundant DiffRecs /*A word is defined as a contiguous sequence of letters from the set defined in the lookup table. The table defaults to 'A'-'Z','a'-'z', but foreign characters or digits are easily included if desired. */ #define includeForeign 0 #define includeDigits 0 static char lookup[256] = { 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, //control 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, //control 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, //punctuation #if includeDigits 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, //digits plus #else 1,1,1,1,1,1,1,1, 1,1,0,0,0,0,0,0, //digits plus #endif 0,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, //caps A-O 1,1,1,1,1,1,1,1, 1,1,1,0,0,0,0,0, //caps P-Z 0,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, //small a-o 1,1,1,1,1,1,1,1, 1,1,1,0,0,0,0,0, //small p-z #if includeForeign 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, //foreign caps 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, //foreign small 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, //symbols 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, //symbols 0,0,0,0,0,0,0,0, 0,0,0,1,1,1,1,1, //symbols plus 0,0,0,0,0,0,0,0, 1,1,0,0,0,0,0,0, //symbols plus #else 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, //foreign caps 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, //foreign small 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, //symbols 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, //symbols 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, //symbols plus 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, //symbols plus #endif 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, //graphic chars 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0 //graphic chars }; #define isAlpha(x) lookup[x] /*A snippet may describe a word or a block of words in the old text and includes the white space. The reference to the new text is used to track the equivalent position in the new text. */ typedef struct Snippet { DiffType type; char* oldTextRef; char* newTextRef; ulong length; void* next; } Snippet; /*The hashTable is an array of lists where each list contains the snippets (words or blocks) from the old text which hash to the same value, in the order in which they occured. */ typedef struct List { Snippet* first; Snippet* last; } List; static List hashTable[hashMod]; static Snippet* snippetStore; //allocated as needed static ushort numSnippets; static ushort nextFreeSnippet; /*The hash function is used to distribute different snippets into different lists as evenly as possible. */ ushort Hash(char* text,ulong numChars) { register ulong accumulator=numChars; if (numChars>6) numChars=6; switch (numChars) { case 6:accumulator=(accumulator<<5)+*text++; case 5:accumulator=(accumulator<<5)+*text++; case 4:accumulator=(accumulator<<5)+*text++; case 3:accumulator=(accumulator<<5)+*text++; case 2:accumulator=(accumulator<<5)+*text++; case 1:accumulator=(accumulator<<5)+*text++; case 0:; } return accumulator % hashMod; } /*Clears all snippets to 0*/ void ClearSnippetStore() { Snippet* s=snippetStore; ushort n=numSnippets-1; s->type=deletedText; s->oldTextRef=0; s->newTextRef=0; s->length=0; s->next=0; s++; while (n--) *s++=*snippetStore; } /*Preallocates an array of snippets to handle oldText. All snippets are initially marked deletedText. */ void GetSnippetStore(ulong snippetsAllocated) { ulong memoryRequired; if (snippetsAllocated>65535) snippetsAllocated=65535; numSnippets=(ushort)snippetsAllocated; memoryRequired=numSnippets*sizeof(Snippet); snippetStore=(Snippet*)malloc(memoryRequired); ClearSnippetStore(); nextFreeSnippet=0; } /*Assign the next snippet from the preallocated array.*/ Snippet* NewSnippet(char* text) { register Snippet* s; s=&(snippetStore[nextFreeSnippet++]); s->oldTextRef=text; return s; } /*Consecutive Snippets are merged into larger blocks*/ int ExtendSnippet(Snippet* oldS,Snippet* s) { if (oldS->oldTextRef+oldS->length==s->oldTextRef) { oldS->length+=s->length; s->length=0; return 1; } return 0; } /*Snippets are attached at end of a list (hashTable[])*/ void Record(Snippet* s) { register List* list= &(hashTable[Hash(s->oldTextRef,s->length)]); register Snippet* lastS=list->last; if (lastS) lastS->next=s; else list->first=s; list->last=s; } /*Matches two substrings of length numChars*/ int Match(char* text0,char* text1,ulong numChars) { while (numChars--) { if (*text0!=*text1) return 0; text0++; text1++; } return 1; } /*Snippets of the oldText are matched against a word from newText. The first matching word is removed from the hash list. */ Snippet* FindAndRemoveSnippet(char* text,ulong numChars) { List* list=&(hashTable[Hash(text,numChars)]); Snippet* s=list->first; Snippet* father=0; while (s) { if ((s->length==numChars) && Match(s->oldTextRef,text,numChars)) break; father=s; s=(Snippet*)(s->next); } if (father) { father->next=s->next; if (list->last==s) list->last=father; } else list->first=list->last=0; return s; } /*The following macros set up the 3 different types of DiffRecs. */ #define StartMoveRecord \ { diffPtr->type=movedText; \ diffPtr->rangeStart=block->oldTextRef-oldText; \ diffPtr->rangeEnd=diffPtr->rangeStart+block->length-1;\ diffPtr->position=startOfText->oldTextRef+ \ startOfText->length-oldText; \ } #define StartInsertRecord \ { diffPtr->type=insertedText; \ diffPtr->rangeStart=text-newText; \ diffPtr->rangeEnd=diffPtr->rangeStart+wordLength-1; \ diffPtr->position=startOfText->oldTextRef+ \ startOfText->length-oldText; \ } #define StartDeleteRecord \ { diffPtr->type=deletedText; \ diffPtr->rangeStart=block->oldTextRef-oldText; \ diffPtr->rangeEnd=diffPtr->rangeStart+block->length-1;\ diffPtr->position=block->newTextRef-newText; \ } /*The following macro marks all words from startOfText to "toHere" as potential candidates for delete. Any of the words will be overwritten and become movedText if they end up being matched in newText later. */ #define MarkDeletedWords(toHere) \ { canDel=startOfText; \ while (++canDel<toHere) \ if (canDel->type==deletedText) \ canDel->newTextRef=markNew; \ } /* A matching word may be found in the old text at a point far beyond the current insertion point. It would be a shame to declare this word the new insertion point, and then have to move all intervening words (yet to be matched). It is better to move the smaller block or word up. This macro is a heuristic attempt to make a sensible guess as to which block is larger, the matching word (100%) or the intervening text (adjusted according to the ratio of remaining chars still required for matching, at 50%). */ #define BestGuess \ (skippedChars*(remainingNew>>1)> \ block->length*remainingOld) /*Words are extracted by first scanning through any preceding white space, then by scanning through the alpha characters, until another white space occurs. It is assumed that \x0 does not occur as part of either text. The last character in the text is then temporarily set to 0 so we easily find the end of text. */ #define NextWord \ { while ((*t)&&(0==isAlpha(*t))) t++; \ while (0!=isAlpha(*t)) t++; \ } #define EmitRecord \ { diffPtr++; \ if (diffPtr-diffs>=maxDiffRecs) return maxDiffRecs; \ } #define ClearHashTable \ { ushort n=hashMod; \ List* H=hashTable; \ while (n--) { \ H->first=0; \ H->last=0; \ H++; \ } \ } /*The function ParseOldText scans the oldText and creates an array of snippets, where each snippet corresponds to one word. In addition, each snippet is linked into a list, where each list is headed by a List pointer in hashTable. The last character of the text is temporarily set to 0 as an end-of-text marker. */ void ParseOldText( char* oldText, ulong numSameHead, ulong numChars) { Snippet* s; char* text=oldText+numSameHead; char* t; char lastChar=text[numChars-1]; //save it text[numChars-1]=0; ClearHashTable; t=text; NewSnippet(text); //A null snippet anchors the start of the text. do { NextWord; s=NewSnippet(text); s->length=t-text; if (0==*t) break; Record(s); text=t; } while (1); *t=lastChar; //restore the last char s->length++; Record(s); } /* The ParseNewText function scans the newText and tries to match it with the oldText. It proceeds by isolating words, and matching them against previously created Snippets (see ParseOldText). It is a state machine which tries to agglomerate contiguous blocks of words while it is in either the insert or move state. When it switches states it creates a DiffRec for the preceding block. It moves the "startOfText" in the oldText along as it scans the newText and encounters matching blocks. If the accumulated matching (movedText) block lies before the current startOfText, or if it is small and lies far forward, it is made into a movedText record. But otherwise, it stays un-moved, and becomes the new startOfText. Blocks of contiguous words assembled during the insert state (i.e. no matching words in oldText) are also accumulated and result in insertedText records. DeletedText records are created later, after all text has been scanned, by collecting the leftovers in oldText, (see CollectDeletes below). */ short ParseNewText( //returns number of inserted DiffRecs char* oldText, char* newText, DiffRec* diffs, ulong numSameHead, ulong numOldChars, ulong numNewChars, ulong maxDiffRecs) { Snippet* s; Snippet* canDel; Snippet* block; Snippet* startOfText=snippetStore;//null snippet Snippet* endOfText= NewSnippet(oldText+numSameHead+numOldChars); char* text=newText+numSameHead; char* markNew=text; char* t; DiffRec* diffPtr=diffs; ulong wordLength; ulong remainingNew=numNewChars; ulong remainingOld=numOldChars; long skippedChars; // signed ! int state=inLimbo; char lastChar; char done=0; if (0==numNewChars) goto cleanup; lastChar=text[numNewChars-1]; text[numNewChars-1]=0; do { t=text; NextWord; wordLength=t-text; if (0==*t) { *t=lastChar; //restore last char wordLength++; done=1; } if (0!=(s=FindAndRemoveSnippet(text,wordLength))) { s->type=movedText; remainingOld-=s->length; remainingNew-=s->length; switch (state) { case inMove: if (0==ExtendSnippet(block,s)) { skippedChars=block->oldTextRef- startOfText->oldTextRef-startOfText->length; if ((skippedChars<0) || BestGuess) { StartMoveRecord; EmitRecord; } else { MarkDeletedWords(block); startOfText=block; } block=s; } break; case inInsert: EmitRecord; case inLimbo: block=s; markNew=text; state=inMove; } } else { remainingNew-=wordLength; switch (state) { case inInsert: diffPtr->rangeEnd+=wordLength; break; case inMove: skippedChars=block->oldTextRef- startOfText->oldTextRef-startOfText->length; if ((skippedChars<0) || BestGuess) { StartMoveRecord; EmitRecord; } else { MarkDeletedWords(block); startOfText=block; } case inLimbo: StartInsertRecord; state=inInsert; } } text=t; } while (0==done); cleanup: switch (state) { case inMove: skippedChars=block->oldTextRef- startOfText->oldTextRef-startOfText->length; if (skippedChars<0) { StartMoveRecord; } else { MarkDeletedWords(endOfText); break; } case inInsert: EmitRecord; case inLimbo: MarkDeletedWords(endOfText); } return diffPtr-diffs; } /*The CollectDeletes function scans through the entire snippet array, and detects all snippets still marked as deletedText. Contiguous blocks of these are assembled to generate deletedText DiffRecs. */ short CollectDeletes( DiffRec* diffs, char* oldText, char* newText, ulong maxDiffRecs) { DiffRec* diffPtr=diffs; Snippet* s=snippetStore; Snippet* block; ushort n=nextFreeSnippet; int state=inLimbo; while (n--) { if (s->length) { if (s->type==deletedText) { if (state==inLimbo) { block=s; state=inDelete; } else block->length+=s->length; } else if (state==inDelete) { StartDeleteRecord; EmitRecord; state=inLimbo; } } s++; } //cleanup: if (state==inDelete) { StartDeleteRecord; diffPtr++; } return diffPtr-diffs; } #if countWords /*This function could be used to tailor make the number of snippets in snippetStore to the exact number required */ short WordCount(char* text,ulong numChars) { register char* t=text; short WC=1; char lastChar; if (numChars<=1) return numChars; lastChar=t[numChars-1]; t[numChars-1]=0; do { NextWord; WC++; } while (*t); *t=lastChar; return WC; } #endif /*Null records can result from cancelling common parts of insert/delete pairs, for example insert " [The" and delete " The" would become just insert "[". The delete record would have a length of 0, i.e. a rangeEnd below rangeStart. If this occurred at the start of text, the value 0xFFFFFFFF would occur for rangeEnd, surely not a good thing. It is surely best to eliminate those null records. */ void ExtractNullRecs(DiffRec* diffs,short numDiffRecs) { DiffRec* d=diffs+numDiffRecs; short i=numDiffRecs; short numMove; while (i--) { d--; if (d->type==nullRec) { DiffRec* dx=d+1; numDiffRecs--; numMove=numDiffRecs-i; while (numMove--) *d++=*dx++; numMove=i; } } } /*Since all snippets (and hence DiffRecs) refer to words which usually include leading white space, the might differ as such, but have common white space with different words, or vice versa. The ReduceInsDelPairs attempts to match pairs at the same text location, and remove their common parts. The loop relies on the existing order of records, as created by ParseNewText, followed by CollectCeletes. This means delete records are at the end, and all are in the order of the text. Ins and Del records are compared if they apply to the same insertion/ deletion point, and share common words or white space at either end of the block. The aggregation of blocks hurts sometimes, in that redundancies in the middle will not be found here. */ short ReduceInsDelPairs( DiffRec* diffs, short numDiffRecs, char* oldText, char* newText) { DiffRec* dRec=diffs+numDiffRecs-1; DiffRec* iRec=dRec; short eliminated=0; while (iRec>diffs) { iRec--; if (insertedText==iRec->type) break; } while (dRec>diffs) { top_of_loop: if (deletedText!=dRec->type) break; if (dRec->rangeStart>iRec->position) { dRec--; continue; } if (dRec->rangeStart==iRec->position) { //take care of disposable common chars at START of block do { ulong dStart=dRec->rangeStart; ulong iStart=iRec->rangeStart; ulong n=0; ulong n0=0; //take care of common white space: while (oldText[n+dStart]==newText[n+iStart]) { if (isAlpha(oldText[n+dStart])) break; n++; } //take care of common alpha but it must be a whole word: n0=n; while (oldText[n+dStart]==newText[n+iStart]) { if (isAlpha(oldText[n+dStart])) { n++; if (0==isAlpha(oldText[n+dStart])) {//success n0=n; break; } } } if (n0) { dRec->position+=dStart-dRec->rangeStart+n0; dRec->rangeStart=dStart+n0; iRec->position+=iStart-iRec->rangeStart+n0; iRec->rangeStart=iStart+n0; } else break; } while (oldText[dRec->rangeStart]== newText[iRec->rangeStart]); //take care of disposable common chars at END of block do { ulong dEnd=dRec->rangeEnd; ulong iEnd=iRec->rangeEnd; ulong n=0; ulong n0=0; //take care of common alpha but it must be a whole word: while (oldText[dEnd-n]==newText[iEnd-n]) { if (isAlpha(oldText[dEnd-n])) { n++; if (0==isAlpha(oldText[dEnd-n])) {//success n0=n; break; } } } n=n0; //take care of common white space: while (oldText[dEnd-n]==newText[iEnd-n]) { if (isAlpha(oldText[dEnd-n])) break; n++; } if (n) { iRec->rangeEnd=iEnd-n; dRec->rangeEnd=dEnd-n; } else break; } while (oldText[dRec->rangeEnd]== newText[iRec->rangeEnd]); if ((long)(dRec->rangeStart)>(long)(dRec->rangeEnd)) { dRec->type=nullRec; eliminated++; } if ((long)(iRec->rangeStart)>(long)(iRec->rangeEnd)) { iRec->type=nullRec; eliminated++; } } else while (iRec>diffs) { //find next lower iRec iRec--; if (insertedText==iRec->type) goto top_of_loop; } dRec--; } if (eliminated) ExtractNullRecs(diffs,numDiffRecs); return eliminated; } /*The FindWordDifferences function is primarily a collection of calls to the other routines. In addition, it tries to eliminate parts of the texts from processing which are completely identical at the start or the tail ends of the texts. If an insufficient number of DiffRecs was allocated, and maxDiffRecs is reached while processing the texts, the function returns this value without further ado. */ short FindWordDifferences ( char *oldText, /* pointer to old version of text */ char *newText, /* pointer to new version of text */ ulong numOldChars, /* number of characters in oldText */ ulong numNewChars, /* number of characters in newText */ DiffRec diffs[], /* pointer to preallocated array where text differences are to be stored */ ulong maxDiffRecs /* number of DiffRecs preallocated */ ) { ulong numSameHead,numSameTail; long numUniqueOldChars; long numUniqueNewChars; short numDiffRecs=0; register char* OTx=oldText; register char* NTx=newText; register char* OTlimit; long deltaChars=numOldChars-numNewChars; if (deltaChars>0) OTlimit=oldText+numNewChars; else OTlimit=oldText+numOldChars; //check for common head: while (OTx<OTlimit) { if (*OTx != *NTx) break; OTx++; NTx++; } if ((isAlpha(*OTx)) || (isAlpha(*NTx))) { //backtrack to start of word while ((OTx>oldText) && (isAlpha(OTx[-1]))) OTx--; if (OTx>oldText) OTx--; //grab previous WSP } numSameHead=OTx-oldText; //check for common tail: OTx=oldText+numOldChars-1; NTx=newText+numNewChars-1; if (deltaChars>0) OTlimit=oldText+deltaChars; else OTlimit=oldText; while (OTx>OTlimit) { if (*OTx != *NTx) break; OTx--; NTx--; } if ((isAlpha(*OTx)) || (isAlpha(*NTx))) { //track to end of word while ((OTx<oldText+numOldChars-1) && (isAlpha(OTx[1]))) OTx++; } numSameTail=oldText-OTx+numOldChars-1; numUniqueOldChars=numOldChars-numSameTail-numSameHead; numUniqueNewChars=numNewChars-numSameTail-numSameHead; /*These values may be negative! If both are negative, old and new texts are identical, and the function can return immediately. */ if ((numUniqueOldChars<0) && (numUniqueNewChars<0)) return 0; #if countWords GetSnippetStore(3+ WordCount(oldText+numSameHead,numUniqueOldChars)); #else GetSnippetStore(3+ numUniqueOldChars/averageWordSize); #endif if (numUniqueOldChars>0) ParseOldText(oldText,numSameHead,numUniqueOldChars); numDiffRecs+= ParseNewText(oldText,newText,diffs, numSameHead,numUniqueOldChars,numUniqueNewChars, maxDiffRecs); numDiffRecs+= CollectDeletes(&diffs[numDiffRecs],oldText,newText, maxDiffRecs-numDiffRecs); numDiffRecs-= ReduceInsDelPairs(diffs,numDiffRecs,oldText,newText); free(snippetStore); return numDiffRecs; }